Data Loading

Load required libraries.

library(arm)
library(boot)
library(ggplot2)
library(grid)
library(dplyr)
library(tidyr)
library(RMySQL)
library(RCurl)
source('data_loading.R')

Load in Wordbank common data.

wordbank <- src_mysql(host = "54.149.39.46", dbname="wordbank",
                      user = "wordbank", password = "wordbank")

common.tables <- get.common.tables(wordbank)

admins <- get.administration.data(common.tables$momed,
                                  common.tables$child,
                                  common.tables$instrumentsmap,
                                  common.tables$administration) %>%
  filter(form == "WS", age > 15, age < 32) %>%
  mutate(age.group = cut(age, breaks = c(15, 19, 23, 27, 31)))

items <- get.item.data(common.tables$wordmapping,
                       common.tables$instrumentsmap,
                       common.tables$category)

instrument.tables <- get.instrument.tables(wordbank, common.tables$instruments)

grammar.languages <- unique(filter(instrument.tables, has_grammar == 1)$language)
languages <- unique(instrument.tables$language)

Show number of items in each relevant section.

sections <- items %>%
  filter(form == "WS") %>%
  group_by(language, type) %>%
  summarise(n = n()) %>%
  spread(type, n) %>%
  select(language, word, word_form, complexity)
sections[is.na(sections)] = 0
kable(sections)
language word word_form complexity
Cantonese 804 0 0
Croatian 717 0 0
Danish 725 29 33
English 680 25 37
German 588 0 0
Hebrew 622 0 0
Italian 670 0 0
Mandarin 799 0 0
Norwegian 731 33 42
Russian 728 0 0
Spanish 680 24 37
Swedish 710 0 0
Turkish 711 0 0

Show total number of administrations in each language.

n.admin <- admins %>%
  group_by(language) %>%
  summarise(n = n())
kable(n.admin)
language n
Cantonese 987
Croatian 377
Danish 3038
English 5595
German 1183
Hebrew 253
Italian 658
Mandarin 1056
Norwegian 10095
Russian 773
Spanish 1094
Swedish 900
Turkish 1861

Show number of administrations in each language by age group.

n.admin.age <- admins %>% 
  group_by(language, age.group) %>% 
  summarise(n = n()) %>%
  spread(age.group, n)
kable(n.admin.age)
language (15,19] (19,23] (23,27] (27,31]
Cantonese 238 282 262 205
Croatian 81 105 114 77
Danish 725 857 736 720
English 2020 947 1241 1387
German 164 364 380 275
Hebrew 62 179 12 NA
Italian 70 227 194 167
Mandarin 283 282 281 210
Norwegian 1236 2892 3176 2791
Russian 83 195 239 256
Spanish 260 322 290 222
Swedish 311 307 143 139
Turkish 465 500 460 436

Some utility functions for transforming data values.

get.coded.type <- function(type, complexity_category) {
  if (type == "complexity") {
    return(complexity_category)
    } else {
      return(type)
      }
  }

get.value <- function(type, value) {
  if (type == "word_form" | type == "word") {
    return(value == "produces")
    } else if (type == "complexity") {
      return(value == "complex")
      }
  }

Analysis 1: Syntax and Morphology

Get kid by item data for wordform and complexity items all languages and aggregate them.

get.grammar.data <- function(lang) {
  
  lang.grammar.items <- items %>%
    filter(language == lang, form == "WS",
           type == "word_form" | type == "complexity") %>%
    rename(column = item.id) %>%
    mutate(item.id = as.numeric(substr(column, 6, nchar(column)))) %>%
    select(column, item.id, type, item, definition, complexity_category)
  
  lang.instrument.table <- filter(instrument.tables, language == lang,
                                  form == "WS")$table[[1]]
  
  lang.grammar.data <- get.instrument.data(lang.instrument.table,
                                           lang.grammar.items$column) %>%
    left_join(lang.grammar.items) %>%
    group_by(data_id, type) %>%
    mutate(no_section = all(is.na(value))) %>%
    filter(!no_section) %>%
    mutate(value = ifelse(is.na(value), "", value),
           value = get.value(unique(type), value),
           coded.type = get.coded.type(unique(type), complexity_category),
           coded.type = factor(coded.type,
                               levels = c("word_form", "morphology", "syntax"),
                               labels = c("Word Form",
                                          "Complexity (Morphological)",
                                          "Complexity (Syntactic)")),
           measure = factor(type, levels = c("word_form", "complexity"),
                            labels = c("Word Form", "Complexity"))) %>%
    ungroup() %>%
    select(-complexity_category, -no_section, -type, -column)
  
  num.words <- nrow(filter(items, language == lang, form == "WS", type == "word"))
  
  lang.admins <- admins %>%
    filter(language == lang) %>%
    select(data_id, age, age.group, production, language) %>%
    mutate(vocab.mean = production / num.words)
  
  lang.data <- left_join(lang.grammar.data, lang.admins) %>%
    filter(age > 15 & age < 32)
  
  return(lang.data)
  
  }

grammar.data <- bind_rows(sapply(grammar.languages, get.grammar.data, simplify = FALSE))

Get by kid summary data for all languages.

grammar.summary <- grammar.data %>%
  group_by(language, measure, data_id, age, age.group, vocab.mean) %>%
  summarise(sum = sum(value),
            diff = length(value) - sum,
            mean = sum / length(value))

Fit grammar score models and use them to predict data.

grammar.models <- grammar.summary %>%
  group_by(language, measure) %>%
  do(model = glm(cbind(sum, diff) ~ vocab.mean + age.group,
                 data = ., family="binomial"))

get.grammar.model <- function(lang, meas) {
  return(filter(grammar.models, language == lang, measure == meas)$model[[1]])
  }

grammar.predicted.data <- grammar.summary %>%
  group_by(language, measure) %>%
  mutate(predicted = invlogit(predict.lm(get.grammar.model(unique(language),
                                                           unique(measure)),
                                         data = ., family="binomial")))

Plot score as a function of vocabulary size for each language and measure with model prediction curves.

ggplot(grammar.predicted.data, aes(x = vocab.mean, y = mean, 
                                   colour = age.group, fill = age.group,
                                   label = age.group)) + 
  geom_jitter(alpha=.3, size=.75) +
  geom_line(aes(y=predicted),size=0.65) + 
  facet_grid(language~measure) + 
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "\nVocabulary Size") + 
  scale_y_continuous(limits = c(0,1), breaks = seq(0,1,.25),
                     "Score (Mean Items)\n") + 
  theme_bw(base_size = 11) +
  theme(legend.position = c(0.06,0.92),
        legend.text = element_text(size=7),
        legend.title = element_text(size=7),
        legend.key.height = unit(0.7, "char"),
        legend.key.width = unit(0.4, "cm"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text=element_text(family=font)) +
  scale_color_brewer(type="div", palette=9,
                     name="Age Group\n (months)") +
  scale_fill_brewer(palette = "Spectral",
                    guide=FALSE)

Model comparison: fit grammar models and get their AICs and age coefficients.

grammar.model.metrics <- grammar.summary %>%
  group_by(language, measure) %>%
  do(model.vocab = glm(cbind(sum,diff) ~ vocab.mean, data = .,
                       family="binomial"),
     model.vocab.age = glm(cbind(sum,diff) ~ vocab.mean + age, data = .,
                           family="binomial"),
     model.vocab.age = glm(cbind(sum,diff) ~ vocab.mean * age, data = .,
                           family="binomial")) %>%
  mutate(AIC.vocab = AIC(model.vocab),
         AIC.vocab.age = AIC(model.vocab.age),
         deltaAIC = AIC.vocab - AIC.vocab.age,
         age.coef = coef(model.vocab.age)["age"],
         age.se = se.coef(model.vocab.age)["age"])

Show AICs of grammar models.

kable(select(grammar.model.metrics,
             language, measure, AIC.vocab, AIC.vocab.age, deltaAIC))
language measure AIC.vocab AIC.vocab.age deltaAIC
Danish Word Form 11529.402 11508.696 20.70574
Danish Complexity 13774.059 13324.288 449.77111
English Word Form 18762.575 18664.378 98.19717
English Complexity 30320.285 28695.682 1624.60326
Norwegian Word Form 52851.103 52771.409 79.69365
Norwegian Complexity 77467.584 74612.829 2854.75512
Spanish Word Form 6285.671 6172.703 112.96810
Spanish Complexity 10596.578 9700.317 896.26133

Plot age effect coefficients for each language and measure.

ggplot(grammar.model.metrics, 
       aes(x=language, y=age.coef, fill=measure)) + 
  geom_bar(position="dodge", stat="identity") + 
  geom_linerange(aes(ymin=age.coef-1.96*age.se, ymax=age.coef+1.96*age.se), 
                 position = position_dodge(width=.9)) +
  ylab("Age effect coefficient") + 
  xlab("") +
  theme_bw(base_size = 14) +
  scale_fill_brewer(palette="Set2",
                    name="Measure") +
  theme(legend.position = c(0.15,0.86),
        legend.text = element_text(size=14),
        legend.title = element_text(size=15),
        legend.key.height = unit(1.5, "char"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text=element_text(family=font))

Fit models for each wordform and complexity item and get their age coefficients.

item.models <- grammar.data %>%
  group_by(language, item, coded.type) %>%
  do(model = glm(value ~ vocab.mean + age, data = ., family = "binomial")) %>%
  mutate(coef = coef(model)["age"],
         se = se.coef(model)["age"])

Function for plotting age effect coefficients by item for a language.

plot.item.coefs <- function(item.models, lang) {
  
  lang.item.models <- filter(item.models, language == lang) %>%
    arrange(coef) %>%
    mutate(item = factor(item, levels=item))
  
  item.plot <- ggplot(lang.item.models,
                      aes(x=item, y=coef, fill=coded.type, label=item)) +
    geom_bar(stat="identity", position="identity", alpha=.5, width=0.9) +
    geom_linerange(aes(ymin=coef-1.96*se, ymax=coef+1.96*se),
                   position = position_dodge(width=.9)) +
    theme_bw(base_size = 12) +
    scale_y_continuous(name="Age Effect Coefficient") +
    scale_x_discrete(name="",breaks=NULL) +
    annotate("text", x = length(lang.item.models$item)/2,
             y = min(lang.item.models$coef-1.96*lang.item.models$se), vjust=0,
             label = lang, size = 9, family=font) +
    scale_fill_brewer(palette="Set2", name="Item Type", drop=FALSE) +
    theme(legend.position = c(0.22,0.82),
          legend.text = element_text(size=14),
          legend.title = element_text(size=13),
          legend.key = element_blank(),
          legend.key.height = unit(1.5, "char"),
          text = element_text(family=font),
          axis.title.y = element_text(size=16),
          axis.text.y = element_text(size=13))
  
  return(item.plot)
  
  }

Plot item interactions for Norwegian.

plot.item.coefs(item.models, "Norwegian")

Plot item interactions for English.

plot.item.coefs(item.models, "English")

Plot item interactions for Danish.

plot.item.coefs(item.models, "Danish")

Plot item interactions for Spanish.

plot.item.coefs(item.models, "Spanish")


Analysis 2: Vocabulary Composition

Get vocabulary composition data for all languages.

get.vocab.composition <- function(lang) {
  
  lang.vocab.items <- filter(items, language == lang, form == "WS", type == "word") %>%
    filter(lexical_category != "other") %>%
    rename(column = item.id) %>%
    mutate(item.id = as.numeric(substr(column, 6, nchar(column))))
  
  lang.instrument.table <- filter(instrument.tables, language == lang,
                                  form == "WS")$table[[1]]
  
  lang.vocab.data <- get.instrument.data(lang.instrument.table,
                                         lang.vocab.items$column) %>%
    left_join(select(lang.vocab.items, item.id, lexical_category, item, definition)) %>%
    mutate(value = ifelse(is.na(value), "", value),
           value = get.value("word", value))
  
  num.words <- nrow(lang.vocab.items)
  
  lang.admins <- admins %>%
    filter(language == lang) %>%
    select(data_id, age, age.group, language)
  
  lang.vocab.summary <- left_join(lang.vocab.data, lang.admins) %>%
    filter(age > 15 & age < 32) %>%
    group_by(data_id, age, age.group, language, lexical_category) %>%
    summarise(sum = sum(value),
              diff = length(value) - sum,
              mean = sum / length(value))
  
  lang.vocab.sizes <- lang.vocab.summary %>%
    summarise(vocab.mean = sum(sum) / num.words)
  
  return(left_join(lang.vocab.summary, lang.vocab.sizes))
    
  }

vocab.composition <- bind_rows(sapply(languages, get.vocab.composition,
                                      simplify = FALSE)) %>%
  filter(lexical_category != "unknown",
         lexical_category != "other") %>%
  mutate(lexical_category = factor(lexical_category,
                                   levels=c("nouns", "predicates", "function_words"),
                                   labels=c("Nouns", "Predicates", "Function Words")))

Fit vocabulary composition models and use them to predict data.

vocab.models <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  do(model = glm(cbind(sum, diff) ~ vocab.mean,
                 data = ., family = "binomial"))

get.vocab.model <- function(lang, cat) {
  return(filter(vocab.models, language == lang, lexical_category == cat)$model[[1]])
  }

vocab.predicted.data <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  mutate(predicted = inv.logit(predict.lm(get.vocab.model(unique(language),
                                                         unique(lexical_category)),
                                         data = ., family = "binomial")))

Plot vocabulary composition as a function of vocabulary size for each language with model prediction curves.

ggplot(vocab.predicted.data,
       aes(x=vocab.mean, y=mean, colour=lexical_category, label=lexical_category)) +
  geom_jitter(alpha=0.15, size=.75) +
  geom_line(aes(y=predicted),size=0.65) + 
  facet_wrap(~ language) +
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "\nVocabulary Size") +
  theme_bw(base_size=12) + 
  theme(legend.position = c(0.065,0.95),
        legend.text = element_text(size=9),
        legend.title = element_text(size=9, lineheight=unit(0.8, "char")),
        legend.key.height = unit(0.8, "char"),
        legend.key.width = unit(0.3, "cm"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text = element_text(family=font)) +
  scale_color_brewer(palette = "Set2", name = "Lexical Category")

Same plot but with loess curves fit instead.

ggplot(vocab.composition,
       aes(x=vocab.mean, y=mean, colour=lexical_category)) +
#  geom_jitter(alpha=0.15, size=.75) +
  geom_smooth(size=.65, method = "loess", control = loess.control(surface="direct")) +
  facet_wrap(~ language) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0,1,.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "\nVocabulary Size") +
  geom_abline(slope=1, intercept=0, color="gray", linetype = "dashed") + 
  theme_bw(base_size=12) + 
  theme(legend.position = c(0.068,0.95),
        legend.text = element_text(size=9),
        legend.title = element_text(size=9, lineheight=unit(0.8, "char")),
        legend.key.height = unit(0.8, "char"),
        legend.key.width = unit(0.3, "cm"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text = element_text(family=font)) +
  scale_color_brewer(palette = "Set1", name = "Lexical Category")

Now separated by lexical category.

ggplot(vocab.composition,
       aes(x=vocab.mean, y=mean, colour=language)) +
  geom_jitter(alpha=0.15, size=.75) +
  geom_smooth(size=.65, method = "loess") +
  facet_wrap(~ lexical_category) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0,1,.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "\nVocabulary Size") +
  geom_abline(slope=1, intercept=0, color="gray", linetype = "dashed") + 
  theme_bw(base_size=12) + 
  theme(legend.position = c(0.05,0.84),
        legend.text = element_text(size=9),
        legend.title = element_text(size=9, lineheight=unit(0.8, "char")),
        legend.key.height = unit(0.8, "char"),
        legend.key.width = unit(0.3, "cm"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text = element_text(family=font)) +
  scale_color_brewer(palette = "Spectral", name = "Language")

For each language and lexical category, find the point in vocabulary size at which the proportion predicted by the loess model is farthest from the diagonal, as well the value of that distance and an estimate of the total area under the curve.

get_predictions <- function(model, pts) {
  predictions <- predict(model, data.frame(vocab.mean=pts))
  predictions[1] <- 0.0
  predictions[length(predictions)] <- 1.0
  return(predictions)
}

get_max_pos <- function(predictions, pts) {
  dists <- predictions - pts
  far.point <- which.max(abs(dists))
  far.point.pos <- pts[far.point]
  return(far.point.pos)
}

get_max_val <- function(predictions, pts) {
  dists <- predictions - pts
  far.point <- which.max(abs(dists))
  far.point.val <- predictions[far.point]
  return(far.point.val)
}

get_area <- function(predictions, pts, width) {
  dists <- predictions - pts
  first_dists <- dists[1:length(dists)-1]
  second_dists <- dists[2:length(dists)]
  mid_dists <- (first_dists + second_dists) / 2
  area <- sum(mid_dists * width)
  return(area)
}

model <- loess(mean ~ vocab.mean, data = filter(vocab.composition, language == "English",
                                                lexical_category == "Nouns"),
               control = loess.control(surface="direct"))

width <- 0.01
pts <- seq(0, 1, width)
dists <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  do(predictions = get_predictions(loess(mean ~ vocab.mean, data = .,
                                         control = loess.control(surface="direct")),
                                   pts)) %>%
  mutate(dist_pos = get_max_pos(predictions, pts),
         dist_val = get_max_val(predictions, pts),
         area = get_area(predictions, pts, width))

# get_group_predictions <- function(lang, lexcat) {
#   values <- filter(dists, language == lang, lexical_category == lexcat)$predictions[[1]]
#   predictions <- data.frame(language = lang,
#                             lexical_category = lexcat,
#                             vocab.mean = pts,
#                             mean = values)
#   return(predictions)
#   }
# 
# predicted_data <- bind_rows(map2(dists$language, dists$lexical_category, get_group_predictions))

Plot point of maximum distance against estimate of area.

ggplot(dists, aes(x = dist_pos, y = area, color = lexical_category)) +
  geom_text(aes(label = language), size = 3) +
  geom_hline(aes(y = 0), color = "gray", linetype = "dashed") +
  scale_colour_brewer(palette = "Set1", name = "") +
  theme_bw() +
  theme(text = element_text(family = font)) +
  scale_x_continuous(name = "\nPosition of maximum distance", limits = c(0,1.05)) +
  scale_y_continuous(name = "Estimate of area\n")

Fit vocabulary composition models and get their AICs and age coefficients.

vocab.model.metrics <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  do(model.vocab = glm(cbind(sum, diff) ~ vocab.mean, data = .,
                       family="binomial"),
     model.vocab.age = glm(cbind(sum, diff) ~ vocab.mean + age, data = .,
                           family="binomial")) %>%
  mutate(AIC.vocab = AIC(model.vocab),
         AIC.vocab.age = AIC(model.vocab.age),
         deltaAIC = AIC.vocab - AIC.vocab.age,
         age.coef = coef(model.vocab.age)["age"],
         age.se = se.coef(model.vocab.age)["age"])

Show AICs of vocabulary composition models.

kable(select(vocab.model.metrics,
             language, lexical_category, AIC.vocab, AIC.vocab.age, deltaAIC))
language lexical_category AIC.vocab AIC.vocab.age deltaAIC
Cantonese Nouns 14482.245 14476.664 5.5806218
Cantonese Predicates 12167.088 12129.923 37.1656096
Cantonese Function Words 7410.952 7319.606 91.3462320
Croatian Nouns 6413.677 6301.574 112.1028614
Croatian Predicates 4160.954 4162.883 -1.9287501
Croatian Function Words 4354.155 4344.532 9.6228351
Danish Nouns 53020.682 52731.806 288.8757241
Danish Predicates 27446.347 27348.650 97.6966033
Danish Function Words 27311.024 27176.950 134.0746317
English Nouns 87205.791 87188.745 17.0454475
English Predicates 52356.356 52108.682 247.6735334
English Function Words 53894.048 53590.791 303.2570575
German Nouns 16323.375 16317.121 6.2535216
German Predicates 9489.998 9482.092 7.9059389
German Function Words 11501.880 11450.649 51.2315301
Hebrew Nouns 3314.099 3315.257 -1.1581716
Hebrew Predicates 2947.483 2937.444 10.0380693
Hebrew Function Words 1714.122 1714.836 -0.7135796
Italian Nouns 10058.856 10051.666 7.1900927
Italian Predicates 7036.120 7036.578 -0.4576938
Italian Function Words 7035.737 6838.334 197.4028054
Mandarin Nouns 15228.015 15185.426 42.5891412
Mandarin Predicates 13689.663 13645.529 44.1336671
Mandarin Function Words 12408.554 12268.564 139.9904042
Norwegian Nouns 149736.961 149732.077 4.8837442
Norwegian Predicates 90645.950 90311.919 334.0305313
Norwegian Function Words 104632.747 104123.143 509.6041882
Russian Nouns 13102.704 13096.580 6.1232420
Russian Predicates 8219.628 8180.489 39.1389638
Russian Function Words 8750.349 8322.000 428.3488135
Spanish Nouns 19694.522 19666.290 28.2320243
Spanish Predicates 13328.751 13330.715 -1.9634181
Spanish Function Words 13260.413 13147.895 112.5177552
Swedish Nouns 15699.998 15414.361 285.6372265
Swedish Predicates 8369.720 8208.730 160.9902699
Swedish Function Words 7109.022 6895.170 213.8522990
Turkish Nouns 35657.010 35491.148 165.8628143
Turkish Predicates 23100.198 23101.121 -0.9226728
Turkish Function Words 18379.666 18344.529 35.1367729

Plot age effect coefficients for each language and lexical category.

ggplot(vocab.model.metrics, 
       aes(x=language, y=age.coef, fill=lexical_category)) + 
  geom_bar(position="dodge", stat="identity") + 
  geom_linerange(aes(ymin=age.coef-1.96*age.se, ymax=age.coef+1.96*age.se), 
                 position = position_dodge(width=.9)) +
  ylab("Age effect coefficient") + 
  xlab("") +
  theme_bw(base_size = 14) +
  scale_fill_brewer(palette = "Set2",
                    name = "Lexical Category") +
  theme(legend.position = c(0.115,0.82),
        legend.text = element_text(size=13),
        legend.title = element_text(size=13),
        legend.key.height = unit(1.5, "char"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text = element_text(family=font))